Below is a list of the top comutating genes with KRAS in CRC. The last four columns have the number of tunor samples with the various combination of mutations; for example, G mut & K WT has the number of tumors with the other gene (G) mutated and KRAS (K) WT.
nonallele_specific_increased_comutation_df %.% {
filter(cancer == "COAD" & hugo_symbol != "KRAS")
filter(p_value < 0.01)
arrange(p_value)
mutate(
geneWT_krasWT = map_dbl(comut_ct_tbl, ~ .x[1, 1]),
geneMut_krasWT = map_dbl(comut_ct_tbl, ~ .x[2, 1]),
geneWT_krasMut = map_dbl(comut_ct_tbl, ~ .x[1, 2]),
geneMut_krasMut = map_dbl(comut_ct_tbl, ~ .x[2, 2]),
)
select(hugo_symbol, p_value, odds_ratio, tidyselect::starts_with("gene"))
rename(
gene = hugo_symbol,
`p-value` = p_value,
`odds ratio` = odds_ratio,
`G WT & K WT` = geneWT_krasWT,
`G mut & K WT` = geneMut_krasWT,
`G WT & K mut` = geneWT_krasMut,
`G mut & K mut` = geneMut_krasMut
)
}
The data used for this analysis was the RNA expression data from TCGA-COAD. This data set has RNA expression for 525 tumor samples. 78 of these samples are hypermutants; these samples were removed from the analysis.
main_alleles <- c("WT", "KRAS_G12A", "KRAS_G12C", "KRAS_G12D", "KRAS_G12V", "KRAS_G13D")
dusp_rna_data <- tcga_coad_rna %.% {
filter(str_detect(hugo_symbol, "DUSP"))
filter(!is_hypermutant)
select(-is_hypermutant)
mutate(
allele = ifelse(ras_allele %in% !!main_alleles, ras_allele, "Other"),
allele = str_remove(allele, "KRAS_"),
allele = factor_alleles(allele)
)
}
# Put DUSP genes in order.
dusp_order <- unique(dusp_rna_data$hugo_symbol)
dusp_num <- as.numeric(str_remove_all(dusp_order, "[:alpha:]"))
dusp_order <- dusp_order[order(dusp_num)]
dusp_rna_data$hugo_symbol <- factor(dusp_rna_data$hugo_symbol, levels = dusp_order)
Below is a sample of the RNA expression data table.
dusp_rna_data %>%
rename(
`DUSP` = hugo_symbol,
`tumor sample` = tumor_sample_barcode,
`RNA (RSEM)` = rna_expr
) %>%
select(-ras_allele) %>%
head() %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| DUSP | tumor sample | RNA (RSEM) | allele |
|---|---|---|---|
| DUSP10 | TCGA-3L-AA1B | 254.352 | WT |
| DUSP10 | TCGA-4N-A93T | 133.527 | G12D |
| DUSP10 | TCGA-4T-AA8H | 275.635 | G12V |
| DUSP10 | TCGA-5M-AAT4 | 507.745 | G12D |
| DUSP10 | TCGA-5M-AAT5 | 572.762 | G12D |
| DUSP10 | TCGA-5M-AATA | 209.145 | WT |
The number of tumor samples with missing data for each DUSP gene.
dusp_rna_data %>%
filter(is.na(rna_expr)) %>%
count(hugo_symbol, sort = TRUE) %>%
rename(DUSP = hugo_symbol, num = n) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE,
position = "left"
)
| DUSP | num |
|---|---|
| DUSP13 | 149 |
| DUSP21 | 149 |
The number of tumor samples with 0 RNA expression values for each DUSP gene.
dusp_rna_data %>%
filter(rna_expr <= 0) %>%
count(hugo_symbol) %>%
rename(DUSP = hugo_symbol, num = n) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE,
position = "left"
)
| DUSP | num |
|---|---|
| DUSP5P | 73 |
| DUSP9 | 34 |
| DUSP13 | 217 |
| DUSP15 | 2 |
| DUSP21 | 284 |
| DUSP26 | 58 |
| DUSP27 | 11 |
All negative RNA expressionvalues were set to 0.
dusp_rna_data %<>% mutate(rna_expr = map_dbl(rna_expr, ~ max(0, .x)))
dusp_rna_data %>%
filter(!is.na(rna_expr)) %>%
mutate(rna_expr = rna_expr + 1) %>%
ggplot(aes(x = allele, y = rna_expr)) +
facet_wrap(~hugo_symbol, scales = "free_y") +
geom_boxplot(outlier.shape = NA) +
scale_y_continuous(trans = "log10") +
theme(
panel.grid.major.x = element_blank(),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
labs(x = NULL, y = "RNA expression (log10 + 1)")
DUSP13 and DUSP21 were removed from the analysis because they were missing data and had very low expression levels.
IGNORE_DUSPS <- c("DUSP13", "DUSP21")
dusp_rna_data %<>% filter(!hugo_symbol %in% IGNORE_DUSPS)
plot_dusp_distribtions <- function(df, x) {
df %>%
ggplot(aes(x = {{ x }})) +
facet_wrap(~hugo_symbol, scales = "free") +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
geom_density() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text = element_text(size = 7),
axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(size = 7),
panel.spacing = unit(1, "mm")
) +
labs(x = "RNA expression")
}
plot_dusp_distribtions(dusp_rna_data, rna_expr) +
ggtitle("Non-transformed RNA expression values")
dusp_rna_data %>%
mutate(rna_expr = log10(rna_expr + 1)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("log10(RNA + 1) transformed data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) transformed data")
dusp_rna_data %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("Centralized and scaled (z-scale) data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) & z-scaled data")
The \(\log_10(\text{RNA} + 1)\), centralized, and scaled values will be used for the analysis.
dusp_rna_data %<>%
mutate(log10_rna_expr = log10(rna_expr + 1)) %>%
group_by(hugo_symbol) %>%
mutate(log10_z_rna = scale_numeric(log10_rna_expr)) %>%
ungroup()
dusp_corr <- dusp_rna_data %>%
pivot_wider(id = tumor_sample_barcode, names_from = hugo_symbol, values_from = log10_z_rna) %>%
select(-tumor_sample_barcode) %>%
corrr::correlate()
#>
#> Correlation method: 'pearson'
#> Missing treated using: 'pairwise.complete.obs'
dusp_corr_pheat <- dusp_corr %>%
as.data.frame() %>%
column_to_rownames("rowname")
colnames(dusp_corr_pheat) <- str_remove(colnames(dusp_corr_pheat), "DUSP")
rownames(dusp_corr_pheat) <- str_remove(rownames(dusp_corr_pheat), "DUSP")
pheatmap::pheatmap(
dusp_corr_pheat,
border_color = NA,
na_col = "white",
main = "Correlation of DUSP gene expression",
)
corrr::network_plot(dusp_corr, min_cor = 0.3) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "DUSP gene expression correlation network")
new_allele_order <- as.character(sort(unique(dusp_rna_data$allele)))
new_allele_order <- c("WT", new_allele_order[new_allele_order != "WT"])
data <- dusp_rna_data %>%
mutate(
grp_allele = case_when(
allele == "G13D" ~ "G13D",
allele == "WT" ~ "WT",
TRUE ~ "KRAS"
),
grp_allele = factor(grp_allele, levels = c("WT", "G13D", "KRAS")),
allele = factor(as.character(allele), levels = new_allele_order)
)
# FOR TESTING
# set.seed(0)
# TESTING_DUSPS <- paste0("DUSP", 1:6)
# TESTING_TSBS <- sample(unique(data$tumor_sample_barcode), 100)
# data <- data %.% {
# filter(hugo_symbol %in% TESTING_DUSPS)
# filter(tumor_sample_barcode %in% TESTING_TSBS)
# }
stash("lm_stan_hier", depends_on = "data", {
lm_stan_hier <- stan_glmer(
log10_z_rna ~ 1 + (1 + allele | hugo_symbol),
data = data,
prior = normal(location = 0, scale = 5),
prior_intercept = normal(location =0, scale = 2),
prior_aux = exponential(rate = 1),
prior_covariance = decov(regularization = 1, concentration = 1, shape = 1, scale = 1)
)
})
#> Updating stash.
#>
#> SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
#> Chain 1:
#> Chain 1: Gradient evaluation took 0.01 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 100 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1:
#> Chain 1:
#> Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
#> Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
#> Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
#> Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
#> Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
#> Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
#> Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
#> Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
#> Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
#> Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
#> Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
#> Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
#> Chain 1:
#> Chain 1: Elapsed Time: 224.27 seconds (Warm-up)
#> Chain 1: 92.83 seconds (Sampling)
#> Chain 1: 317.1 seconds (Total)
#> Chain 1:
#>
#> SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
#> Chain 2:
#> Chain 2: Gradient evaluation took 0 seconds
#> Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
#> Chain 2: Adjust your expectations accordingly!
#> Chain 2:
#> Chain 2:
#> Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
#> Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
#> Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
#> Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
#> Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
#> Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
#> Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
#> Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
#> Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
#> Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
#> Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
#> Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
#> Chain 2:
#> Chain 2: Elapsed Time: 222.87 seconds (Warm-up)
#> Chain 2: 52.78 seconds (Sampling)
#> Chain 2: 275.65 seconds (Total)
#> Chain 2:
#>
#> SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
#> Chain 3:
#> Chain 3: Gradient evaluation took 0 seconds
#> Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
#> Chain 3: Adjust your expectations accordingly!
#> Chain 3:
#> Chain 3:
#> Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
#> Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
#> Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
#> Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
#> Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
#> Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
#> Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
#> Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
#> Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
#> Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
#> Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
#> Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
#> Chain 3:
#> Chain 3: Elapsed Time: 174.51 seconds (Warm-up)
#> Chain 3: 92.81 seconds (Sampling)
#> Chain 3: 267.32 seconds (Total)
#> Chain 3:
#>
#> SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
#> Chain 4:
#> Chain 4: Gradient evaluation took 0 seconds
#> Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
#> Chain 4: Adjust your expectations accordingly!
#> Chain 4:
#> Chain 4:
#> Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
#> Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
#> Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
#> Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
#> Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
#> Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
#> Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
#> Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
#> Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
#> Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
#> Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
#> Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
#> Chain 4:
#> Chain 4: Elapsed Time: 206.03 seconds (Warm-up)
#> Chain 4: 92.84 seconds (Sampling)
#> Chain 4: 298.87 seconds (Total)
#> Chain 4:
#> Warning: There were 3 divergent transitions after warmup. See
#> http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
#> to find out why this is a problem and how to eliminate them.
#> Warning: Examine the pairs() plot to diagnose sampling problems
mcmc_trace(lm_stan_hier, pars = "(Intercept)") /
mcmc_trace(lm_stan_hier, pars = "sigma")
mcmc_trace(lm_stan_hier, regex_pars = "DUSP1")
pp_check(lm_stan_hier, plotfun = "stat", stat = "mean")
pp_check(lm_stan_hier, plotfun = "stat_2d", stat = c("mean", "sd"))
lm_dusp_post <- as.data.frame(lm_stan_hier) %.%
{
mutate(draw = row_number())
select(draw, `(Intercept)`, tidyselect::contains("DUSP"))
pivot_longer(
-c(draw, `(Intercept)`),
names_to = "parameter",
values_to = "value"
)
mutate(
parameter = str_remove_all(parameter, "[:punct:]"),
parameter = str_remove_all(parameter, "b|allele|hugosymbol|")
)
separate(parameter, c("allele", "dusp"), sep = " ")
mutate(allele = str_replace(allele, "Intercept", "WT"))
}
lm_dusp_post %>%
ggplot(aes(x = value)) +
facet_wrap(~dusp, scales = "free") +
geom_vline(
xintercept = 0,
lty = 2,
color = "grey50",
size = 0.9
) +
geom_density(aes(color = allele), size = 1) +
scale_color_manual(values = short_allele_pal) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
labs(
x = "posterior distribution",
y = "probability density",
color = "KRAS allele"
)
lm_dusp_post %.%
{
group_by(dusp, allele)
summarise(
avg = median(value),
hdi_50 = list(unlist(ggdist::hdi(value, .width = 0.50))),
hdi_89 = list(unlist(ggdist::hdi(value, .width = 0.89))),
)
ungroup()
mutate(
hdi_50_lower = map_dbl(hdi_50, ~ .x[[1]]),
hdi_50_upper = map_dbl(hdi_50, ~ .x[[2]]),
hdi_89_lower = map_dbl(hdi_89, ~ .x[[1]]),
hdi_89_upper = map_dbl(hdi_89, ~ .x[[2]])
)
select(-hdi_50, -hdi_89)
} %>%
ggplot(aes(x = avg, y = allele, color = allele)) +
facet_wrap(~ dusp) +
geom_rect(
xmin = -0.1, xmax = 0.1, ymin = Inf, ymax = -Inf,
fill = "grey80",
color = NA,
alpha = 0.1
) +
geom_vline(xintercept = 0, color= "grey50") +
geom_point(size = 2) +
geom_linerange(
aes(xmin = hdi_50_lower, xmax = hdi_50_upper),
size = 1.2,
alpha = 0.8
) +
geom_linerange(
aes(xmin = hdi_89_lower, xmax = hdi_89_upper),
size = 0.9,
alpha = 0.5
) +
scale_color_manual(values = short_allele_pal, drop = TRUE) +
theme(legend.position = "none") +
labs(
x = "posterior distributions",
y = NULL
)
bayestestR::describe_posterior(lm_stan_hier, effects = "all") %>%
arrange(Effects) %>%
as_tibble() %>%
mutate(
Parameter = str_remove_all(Parameter, "[:punct:]"),
Parameter = str_remove_all(Parameter, "^b|allele|hugosymbol"),
) %>%
mutate_if(is.numeric, round, digits = 2)
#> Possible multicollinearity between Sigma[hugo_symbol:alleleG12D,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.79), Sigma[hugo_symbol:alleleG12V,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.71), Sigma[hugo_symbol:alleleG12D,alleleG12D] and Sigma[hugo_symbol:alleleG12D,(Intercept)] (r = 0.77), Sigma[hugo_symbol:alleleG12V,alleleG12V] and Sigma[hugo_symbol:alleleG12V,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG12D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG13D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleOther,alleleG12D] and Sigma[hugo_symbol:alleleOther,(Intercept)] (r = 0.74). This might lead to inappropriate results. See 'Details' in '?rope'.